In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
netflix = pd.read_csv('netflix_titles.csv')
In [3]:
data = netflix.copy()
In [4]:
netflix.shape
Out[4]:
(8807, 12)
In [5]:
netflix.head()
Out[5]:
show_id type title director cast country date_added release_year rating duration listed_in description
0 s1 Movie Dick Johnson Is Dead Kirsten Johnson NaN United States September 25, 2021 2020 PG-13 90 min Documentaries As her father nears the end of his life, filmm...
1 s2 TV Show Blood & Water NaN Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... South Africa September 24, 2021 2021 TV-MA 2 Seasons International TV Shows, TV Dramas, TV Mysteries After crossing paths at a party, a Cape Town t...
2 s3 TV Show Ganglands Julien Leclercq Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... NaN September 24, 2021 2021 TV-MA 1 Season Crime TV Shows, International TV Shows, TV Act... To protect his family from a powerful drug lor...
3 s4 TV Show Jailbirds New Orleans NaN NaN NaN September 24, 2021 2021 TV-MA 1 Season Docuseries, Reality TV Feuds, flirtations and toilet talk go down amo...
4 s5 TV Show Kota Factory NaN Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... India September 24, 2021 2021 TV-MA 2 Seasons International TV Shows, Romantic TV Shows, TV ... In a city of coaching centers known to train I...
In [6]:
netflix.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB
In [7]:
netflix.isna().sum()
Out[7]:
show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64
In [8]:
for i in netflix.columns:
  na=netflix[i].isna().sum()
  percent=na/len(netflix)*100
In [9]:
netflix = netflix.dropna(subset=['cast','country','date_added','rating','duration'],how='any')
In [10]:
netflix['director'] = netflix['director'].fillna(value='Not Found')
In [11]:
netflix.isna().sum()
Out[11]:
show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64
In [12]:
netflix.drop('description',axis=1,inplace=True)
In [13]:
netflix.date_added = pd.to_datetime(netflix.date_added)
In [14]:
def split_data(col1,col2):
    data1 = data[col1].str.split(',',expand=False)
    data2 = data[col2].squeeze()
    df = pd.concat([data1,data2],axis=1).explode(col1).reset_index(drop=True)   
    df[col1] = df[col1].str.strip()
    return df.value_counts().reset_index()
In [15]:
df1 = split_data('country','type').rename(columns={ 0 :"count"})
df2 = split_data('listed_in','type').rename(columns={ 0 :"count"})
In [16]:
df1
Out[16]:
country type count
0 United States Movie 2752
1 India Movie 962
2 United States TV Show 938
3 United Kingdom Movie 534
4 Canada Movie 319
... ... ... ...
179 Croatia TV Show 1
180 Paraguay Movie 1
181 Jamaica Movie 1
182 Samoa Movie 1
183 Liechtenstein Movie 1

184 rows × 3 columns

In [17]:
df2
Out[17]:
listed_in type count
0 International Movies Movie 2752
1 Dramas Movie 2427
2 Comedies Movie 1674
3 International TV Shows TV Show 1351
4 Documentaries Movie 869
5 Action & Adventure Movie 859
6 TV Dramas TV Show 763
7 Independent Movies Movie 756
8 Children & Family Movies Movie 641
9 Romantic Movies Movie 616
10 TV Comedies TV Show 581
11 Thrillers Movie 577
12 Crime TV Shows TV Show 470
13 Kids' TV TV Show 451
14 Docuseries TV Show 395
15 Music & Musicals Movie 375
16 Romantic TV Shows TV Show 370
17 Horror Movies Movie 357
18 Stand-Up Comedy Movie 343
19 Reality TV TV Show 255
20 British TV Shows TV Show 253
21 Sci-Fi & Fantasy Movie 243
22 Sports Movies Movie 219
23 Anime Series TV Show 176
24 Spanish-Language TV Shows TV Show 174
25 TV Action & Adventure TV Show 168
26 Korean TV Shows TV Show 151
27 Classic Movies Movie 116
28 LGBTQ Movies Movie 102
29 TV Mysteries TV Show 98
30 Science & Nature TV TV Show 92
31 TV Sci-Fi & Fantasy TV Show 84
32 TV Horror TV Show 75
33 Anime Features Movie 71
34 Cult Movies Movie 71
35 Teen TV Shows TV Show 69
36 Faith & Spirituality Movie 65
37 Movies Movie 57
38 TV Thrillers TV Show 57
39 Stand-Up Comedy & Talk Shows TV Show 56
40 Classic & Cult TV TV Show 28
41 TV Shows TV Show 16
In [18]:
df3 = netflix.groupby(['rating']).size().reset_index(name='counts')
pieChart = px.pie(df3, values='counts', names='rating',
title='Distribution of Content Ratings on Netflix')
pieChart.show()
In [19]:
df3
Out[19]:
rating counts
0 G 40
1 NC-17 3
2 NR 62
3 PG 275
4 PG-13 470
5 R 779
6 TV-14 1755
7 TV-G 158
8 TV-MA 2657
9 TV-PG 653
10 TV-Y 209
11 TV-Y7 222
12 TV-Y7-FV 4
13 UR 3
In [20]:
netflix['cast']=netflix['cast'].fillna('No Cast Specified')
filtered_cast=pd.DataFrame()
filtered_cast=netflix['cast'].str.split(',',expand=True).stack()
filtered_cast=filtered_cast.to_frame()
filtered_cast.columns=['Actor']
actors=filtered_cast.groupby(['Actor']).size().reset_index(name='Total Content')
actors=actors[actors.Actor !='No Cast Specified']
actors=actors.sort_values(by=['Total Content'],ascending=False)
actorsTop5=actors.head()
actorsTop5=actorsTop5.sort_values(by=['Total Content'])
actorsTop5
Out[20]:
Actor Total Content
36137 Shah Rukh Khan 25
22568 Paresh Rawal 25
22277 Om Puri 27
28537 Takahiro Sakurai 28
2461 Anupam Kher 39
In [21]:
temp = list()
clean_data = data.dropna()
clean_data.reset_index(inplace=True)
for ind, element in clean_data.iterrows():
    type_show = element['type']
    for director in str(element['director']).split(','):
        temp.append([type_show, director])
director_data = pd.DataFrame(temp, columns= ['type', 'director'])
director_data
Out[21]:
type director
0 Movie Haile Gerima
1 TV Show Andy Devonshire
2 Movie Theodore Melfi
3 Movie Christian Schwochow
4 Movie S. Shankar
... ... ...
5955 Movie Majid Al Ansari
5956 Movie David Fincher
5957 Movie Ruben Fleischer
5958 Movie Peter Hewitt
5959 Movie Mozez Singh

5960 rows × 2 columns

In [22]:
director_data_count = director_data.value_counts().to_frame()
director_data_count.reset_index(level=[0,1], inplace=True)
famous_director = director_data_count.rename(columns={0:'count'})
famous_director
Out[22]:
type director count
0 Movie Jan Suter 18
1 Movie Raúl Campos 18
2 Movie Jay Karas 15
3 Movie Marcus Raboy 14
4 Movie Cathy Garcia-Molina 13
... ... ... ...
4443 Movie Tony Bancroft 1
4444 Movie Lasja Fauzia Susatyo 1
4445 Movie Tony Datis 1
4446 Movie Lars Klevberg 1
4447 TV Show Ziad Doueiri 1

4448 rows × 3 columns

In [23]:
for unique_type in famous_director['type'].unique():
    bar, ax = plt.subplots(figsize=(10,10))
    sns.barplot(x = 'director', y = 'count', data = famous_director[famous_director['type'] == unique_type].iloc[:5])
    plt.xlabel('Director in {}'.format(str(unique_type)))
    plt.ylabel('Frequency')
    plt.title('Famous Director in {}'.format(str(unique_type)), size=20)
In [24]:
movie_data = netflix[netflix['type'] == 'Movie']
tv_show_data = netflix[netflix['type'] == 'TV Show']
# bar,ax = plt.subplots(1,2,figsize=(10,10))
temp = netflix[['type', 'release_year']]
temp = temp.value_counts().to_frame()
temp.reset_index(level=[0,1], inplace=True)
temp = temp.rename(columns = {0:'count'})
temp = pd.concat([temp[temp['type'] == 'Movie'][:5], temp[temp['type']== 'TV Show'][:5]])
In [25]:
# ax, bar = plt.subplots(figsize = (10,10))
sns.catplot(x = 'release_year', y = 'count', hue = 'type', data = temp, kind = 'point')
plt.xlabel('Release Year')
plt.ylabel('Frequency')
plt.title('Growth of Movie/TV Show over Years', size=14)
Out[25]:
Text(0.5, 1.0, 'Growth of Movie/TV Show over Years')
In [26]:
df4=netflix[['type','release_year']]
df4=df4.rename(columns={"release_year": "Release Year"})
df5=df4.groupby(['Release Year','type']).size().reset_index(name='Total Content')
df5=df5[df5['Release Year']>=2010]
df5
Out[26]:
Release Year type Total Content
90 2010 Movie 140
91 2010 TV Show 29
92 2011 Movie 137
93 2011 TV Show 37
94 2012 Movie 162
95 2012 TV Show 49
96 2013 Movie 196
97 2013 TV Show 53
98 2014 Movie 237
99 2014 TV Show 73
100 2015 Movie 344
101 2015 TV Show 128
102 2016 Movie 574
103 2016 TV Show 177
104 2017 Movie 649
105 2017 TV Show 213
106 2018 Movie 653
107 2018 TV Show 282
108 2019 Movie 513
109 2019 TV Show 309
110 2020 Movie 417
111 2020 TV Show 327
112 2021 Movie 152
113 2021 TV Show 183
In [27]:
df4=netflix[['type','release_year']]
df4=df4.rename(columns={"release_year": "Release Year"})
df5=df4.groupby(['Release Year','type']).size().reset_index(name='Total Content')
df5=df5[df5['Release Year']>=2010]
fig3 = px.line(df5, x="Release Year", y="Total Content", color='type',title='Trend of content produced over the year')
fig3.show()
In [ ]: